Import the data

# Read the Zillow rental price table from disk and preview its first rows.
# read.csv() already returns a data frame, so no extra wrapper is needed.
zillow <- read.csv("price.csv")
head(zillow)

Remove blank pricing data

# Remove empty rows, NA's are only present if no data was collected for the location.
# na.omit() drops every row that has an NA in any column, so a location with
# even one missing value is discarded entirely from the analysis that follows.
zillow = na.omit(zillow)

Subset the Zillow data into east coast and west coast and annual subsets of the regions.

# Keep only the states treated as East Coast states in this analysis; rows
# belonging to any other state are discarded.
eastcoast <- subset(
  zillow,
  State %in% c("CT", "NY", "VA", "FL", "NJ", "NC", "SC", "GA", "DE", "MA",
               "MD", "ME", "RI", "NH")
)

# Annual snapshot: the six leading descriptive columns plus every 12th monthly
# price column from column 7 (November.2010) through column 79
# (November.2016). This selects exactly the columns that survived the former
# chain of negative-index drops (original columns 1:6, 7, 19, 31, 43, 55, 67,
# 79), but states the intent in a single step.
eastcoast_annual <- eastcoast[, c(1:6, seq(7, 79, by = 12))]

# Keep only the states treated as West Coast states in this analysis.
westcoast <- subset(zillow, State %in% c("CA", "OR", "WA", "HI", "AK"))

# Annual snapshot: the six leading descriptive columns plus every 12th monthly
# price column from column 7 (November.2010) through column 79
# (November.2016). This selects exactly the columns that survived the former
# chain of negative-index drops (original columns 1:6, 7, 19, 31, 43, 55, 67,
# 79), but states the intent in a single step.
westcoast_annual <- westcoast[, c(1:6, seq(7, 79, by = 12))]

# Drop columns 2 through 6 so that only the first column and the price
# columns remain in the full West Coast table.
westcoast <- westcoast[-(2:6)]

# Same reduction for the annual tables, stored under new names so the
# originals (with their descriptive columns) stay available.
westAnn <- westcoast_annual[-(2:6)]

eastAnn <- eastcoast_annual[-(2:6)]

Find the difference in rent price between November 2010 and November 2016 and add it as a column, AND add a column with the average rent price over the years

# Use dplyr to add two derived columns to each regional annual table:
#   Price.Diff - November.2016 rent minus November.2010 rent
#   Avg.Price  - row mean of columns 7 to 13 of the table as it stands before
#                this call
eastcoast_annual <- mutate(
  eastcoast_annual,
  Price.Diff = November.2016 - November.2010,
  Avg.Price = rowMeans(eastcoast_annual[7:13])
)

westcoast_annual <- mutate(
  westcoast_annual,
  Price.Diff = November.2016 - November.2010,
  Avg.Price = rowMeans(westcoast_annual[7:13])
)

Clean Data of Outliers

# clean the data
# outlierKD2() is defined outside this section. Judging by the report it
# prints, it identifies outliers in the named column and, with rm = TRUE,
# removes them (presumably by blanking the values rather than dropping rows —
# TODO confirm against the helper). First pass of the West Coast chain: starts
# from the raw annual table and cleans November.2010.
westcoast_annualClean = outlierKD2(westcoast_annual, November.2010, rm = TRUE)

Outliers identified: 64 Proportion (%) of outliers: 5.5 Mean of the outliers: 5337 Mean without removing outliers: 1733 Mean if we remove outliers: 1534 Outliers successfully removed

# Second West Coast pass: clean November.2016 on the already-cleaned table.
westcoast_annualClean = outlierKD2(westcoast_annualClean, November.2016, rm = TRUE)

Outliers identified: 70 Proportion (%) of outliers: 6.1 Mean of the outliers: 6518 Mean without removing outliers: 2093 Mean if we remove outliers: 1823 Outliers successfully removed

# Third West Coast pass: clean the derived Avg.Price column.
westcoast_annualClean = outlierKD2(westcoast_annualClean, Avg.Price, rm = TRUE)

Outliers identified: 72 Proportion (%) of outliers: 6.3 Mean of the outliers: 5489 Mean without removing outliers: 1837 Mean if we remove outliers: 1608 Outliers successfully removed

# Fourth West Coast pass: clean the derived Price.Diff column.
westcoast_annualClean = outlierKD2(westcoast_annualClean, Price.Diff, rm = TRUE)

Outliers identified: 76 Proportion (%) of outliers: 6.6 Mean of the outliers: 1512 Mean without removing outliers: 359 Mean if we remove outliers: 283 Outliers successfully removed

# First pass of the East Coast chain: starts from the raw annual table and
# cleans November.2010 with the externally defined outlierKD2() helper.
eastcoast_annualClean = outlierKD2(eastcoast_annual, November.2010, rm = TRUE)

Outliers identified: 171 Proportion (%) of outliers: 4.5 Mean of the outliers: 3417 Mean without removing outliers: 1458 Mean if we remove outliers: 1370 Outliers successfully removed

# Second East Coast pass: clean November.2016 on the already-cleaned table.
eastcoast_annualClean = outlierKD2(eastcoast_annualClean, November.2016, rm = TRUE)

Outliers identified: 162 Proportion (%) of outliers: 4.2 Mean of the outliers: 3953 Mean without removing outliers: 1598 Mean if we remove outliers: 1498 Outliers successfully removed

# Third East Coast pass: clean the derived Avg.Price column.
eastcoast_annualClean = outlierKD2(eastcoast_annualClean, Avg.Price, rm = TRUE)

Outliers identified: 171 Proportion (%) of outliers: 4.5 Mean of the outliers: 3583 Mean without removing outliers: 1513 Mean if we remove outliers: 1420 Outliers successfully removed

# Fourth East Coast pass: clean the derived Price.Diff column.
eastcoast_annualClean = outlierKD2(eastcoast_annualClean, Price.Diff, rm = TRUE)

Outliers identified: 168 Proportion (%) of outliers: 4.4 Mean of the outliers: 595 Mean without removing outliers: 140 Mean if we remove outliers: 120 Outliers successfully removed

Transpose Data Correctly

# Reshape the West Coast annual table so that each row is a date and each
# column is a city: use City.Code as the row names, drop that column,
# transpose, strip spaces and commas from the resulting column names, and move
# the dates out of the row names into a leading Date column.
rownames(westAnn) <- westAnn$City.Code
westAnn <- westAnn[-1]
westAnn <- as.data.frame(t(westAnn))
names(westAnn) <- str_replace_all(names(westAnn), c(" " = "", "," = ""))
westAnn <- cbind(Date = rownames(westAnn), westAnn)
# seq_len() stays well-defined for a zero-row frame, unlike 1:nrow().
rownames(westAnn) <- seq_len(nrow(westAnn))
head(westAnn)


# Same reshaping for the East Coast annual table.
rownames(eastAnn) <- eastAnn$City.Code
eastAnn <- eastAnn[-1]
eastAnn <- as.data.frame(t(eastAnn))
names(eastAnn) <- str_replace_all(names(eastAnn), c(" " = "", "," = ""))
eastAnn <- cbind(Date = rownames(eastAnn), eastAnn)
rownames(eastAnn) <- seq_len(nrow(eastAnn))
head(eastAnn)

Summary of the two regions annually

# Basic descriptive statistics for columns 7 to 15 of each annual table, first
# with base summary() and then rendered through xkablesummary().
summary(eastcoast_annual[, 7:15])
summary(westcoast_annual[, 7:15])
xkablesummary(westcoast_annual[, 7:15], title = "West Coast Summary Annually")
xkablesummary(eastcoast_annual[, 7:15], title = "East Coast Summary Annually")

# Condensed tables restricted to columns 7, 10, 13, 14 and 15.
xkablesummary(
  westcoast_annual[, c(7, 10, 13, 14, 15)],
  title = "West Coast Summary Annually"
)
xkablesummary(
  eastcoast_annual[, c(7, 10, 13, 14, 15)],
  title = "East Coast Summary Annually"
)

Finding the Outliers to Help with Graphs

# checking outliers: collect the values that boxplot.stats() flags as outliers
# in the November 2010 and November 2016 columns of each raw annual table,
# then print the smallest flagged value of each set.
outWest2010 <- boxplot.stats(westcoast_annual[["November.2010"]])[["out"]]
outWest2016 <- boxplot.stats(westcoast_annual[["November.2016"]])[["out"]]
outEast2010 <- boxplot.stats(eastcoast_annual[["November.2010"]])[["out"]]
outEast2016 <- boxplot.stats(eastcoast_annual[["November.2016"]])[["out"]]
min(outWest2010)
min(outWest2016)
min(outEast2010)
min(outEast2016)

West Coast Annual November 2010 Start and November 2016 End Histogram

library(ggplot2)
# West Coast rent histograms. November 2010 is drawn in red and November 2016
# in blue; each month is plotted once from the cleaned table and once from the
# raw table, and the last plot overlays both months from the raw table.
ggplot(westcoast_annualClean, aes(x = November.2010)) +
  geom_histogram(fill = "red", alpha = 0.4, bins = 70) +
  labs(
    title = "Rental Price Count on the West Coast in November 2010",
    x = "Rental Prices",
    y = "Frequency"
  )

ggplot(westcoast_annual, aes(x = November.2010)) +
  geom_histogram(fill = "red", alpha = 0.4, bins = 70) +
  labs(
    title = "Rental Price Count on the West Coast in November 2010",
    x = "Rental Prices",
    y = "Frequency"
  )

ggplot(westcoast_annual, aes(x = November.2016)) +
  geom_histogram(fill = "blue", alpha = 0.4, bins = 70) +
  labs(
    title = "Rental Price Count on the West Coast in November 2016",
    x = "Rental Prices",
    y = "Frequency"
  )

ggplot(westcoast_annualClean, aes(x = November.2016)) +
  geom_histogram(fill = "blue", alpha = 0.4, bins = 70) +
  labs(
    title = "Rental Price Count on the West Coast in November 2016",
    x = "Rental Prices",
    y = "Frequency"
  )

ggplot(westcoast_annual) +
  geom_histogram(aes(x = November.2010), fill = "red", alpha = 0.4, bins = 70) +
  geom_histogram(aes(x = November.2016), fill = "blue", alpha = 0.4, bins = 70) +
  labs(
    title = "Rental Price Count on the West Coast from November 2010 to November 2016",
    x = "Rental Prices",
    y = "Frequency"
  )

# Named palette: the names match the constant strings mapped to `fill` below.
colors <- c("November.2010" = "red", "November.2016" = "blue")

# Overlay of the cleaned November 2010 and November 2016 distributions with a
# legend. The two layers map a constant label to the `fill` aesthetic, so the
# palette has to be attached with scale_fill_manual(); the previous
# scale_color_manual() targeted the unused colour aesthetic and the red/blue
# palette was never applied.
ggplot(westcoast_annualClean) +
  geom_histogram(aes(November.2010, fill = "November.2010"), alpha = 0.4, bins = 40) +
  geom_histogram(aes(November.2016, fill = "November.2016"), alpha = 0.4, bins = 40) +
  labs(
    title = "Rental Price Count on the West Coast from November 2010 to November 2016",
    x = "Rental Prices",
    y = "Frequency",
    fill = "Legend"
  ) +
  scale_fill_manual(values = colors)

East Coast Annual November 2010 Start and November 2016 End Histogram

# East Coast rent histograms. November 2010 is drawn in red and November 2016
# in blue; each month is plotted from the raw table and from the cleaned
# table, and the last plot overlays both months from the raw table.
ggplot(eastcoast_annual, aes(November.2010)) +
  geom_histogram(fill = "red", alpha = 0.4, bins = 40) +
  labs(
    title = "Rental Price Count on the East Coast from November 2010",
    x = "Rental Prices",
    y = "Frequency"
  )

ggplot(eastcoast_annualClean, aes(November.2010)) +
  geom_histogram(fill = "red", alpha = 0.4, bins = 40) +
  labs(
    title = "Rental Price Count on the East Coast from November 2010",
    x = "Rental Prices",
    y = "Frequency"
  )

# This plot shows November.2016, so it is titled and coloured like the other
# 2016 histograms (it previously carried the 2010 title and the red fill).
ggplot(eastcoast_annual, aes(November.2016)) +
  geom_histogram(fill = "blue", alpha = 0.4, bins = 40) +
  labs(
    title = "Rental Price Count on the East Coast from November 2016",
    x = "Rental Prices",
    y = "Frequency"
  )

ggplot(eastcoast_annualClean, aes(November.2016)) +
  geom_histogram(fill = "blue", alpha = 0.4, bins = 40) +
  labs(
    title = "Rental Price Count on the East Coast from November 2016",
    x = "Rental Prices",
    y = "Frequency"
  )

ggplot(eastcoast_annual) +
  geom_histogram(aes(November.2010), fill = "red", alpha = 0.4, bins = 40) +
  geom_histogram(aes(November.2016), fill = "blue", alpha = 0.4, bins = 40) +
  labs(
    title = "Rental Price Count on the East Coast from November 2010 to November 2016",
    x = "Rental Prices",
    y = "Frequency"
  )

# Named palette: the names match the constant strings mapped to `fill` below.
colors <- c("November.2010" = "red", "November.2016" = "blue")

# Overlay of the cleaned November 2010 and November 2016 distributions with a
# legend. The two layers map a constant label to the `fill` aesthetic, so the
# palette has to be attached with scale_fill_manual(); the previous
# scale_color_manual() targeted the unused colour aesthetic and the red/blue
# palette was never applied.
ggplot(eastcoast_annualClean) +
  geom_histogram(aes(November.2010, fill = "November.2010"), alpha = 0.4, bins = 40) +
  geom_histogram(aes(November.2016, fill = "November.2016"), alpha = 0.4, bins = 40) +
  labs(
    title = "Rental Price Count on the East Coast from November 2010 to November 2016",
    x = "Rental Prices",
    y = "Frequency",
    fill = "Legend"
  ) +
  scale_fill_manual(values = colors)

Boxplot of West Coast Zillow Rental Prices 2010-2016

# West Coast boxplots. For each of November 2010 and November 2016: per-state
# boxes from the raw table, per-state boxes from the cleaned table, and one
# region-wide box from the cleaned table.
ggplot(westcoast_annual, aes(x = State, y = November.2010, color = State)) +
  geom_boxplot() +
  labs(title = "West Coast States November 2010 Rent Prices with Outliers")

ggplot(westcoast_annualClean, aes(x = State, y = November.2010, color = State)) +
  geom_boxplot() +
  labs(title = "West Coast States November 2010 Rent Prices without Outliers")

ggplot(westcoast_annualClean, aes(x = November.2010)) +
  geom_boxplot(
    color = "blue",
    outlier.shape = 9,
    outlier.size = 2,
    outlier.color = "blue"
  ) +
  labs(title = "Entire West Coast November 2010 Boxplot") +
  scale_x_continuous(breaks = seq(500, 4000, 500))

ggplot(westcoast_annual, aes(x = State, y = November.2016, color = State)) +
  geom_boxplot() +
  labs(title = "West Coast States November 2016 Rent Prices with Outliers")

ggplot(westcoast_annualClean, aes(x = State, y = November.2016, color = State)) +
  geom_boxplot() +
  labs(title = "West Coast States November 2016 Rent Prices without Outliers")

ggplot(westcoast_annualClean, aes(x = November.2016)) +
  geom_boxplot(
    color = "blue",
    outlier.shape = 9,
    outlier.size = 2,
    outlier.color = "blue"
  ) +
  labs(title = "Entire West Coast November 2016 Boxplot") +
  scale_x_continuous(breaks = seq(500, 4000, 500))

Boxplot of East Coast Zillow Rental Prices 2010-2016

# East Coast boxplots. For each of November 2010 and November 2016: per-state
# boxes from the raw table, per-state boxes from the cleaned table, and one
# region-wide box from the cleaned table.
ggplot(eastcoast_annual, aes(x = State, y = November.2010, color = State)) +
  geom_boxplot() +
  labs(title = "East Coast States November 2010 Rent Prices with Outliers")

ggplot(eastcoast_annualClean, aes(x = State, y = November.2010, color = State)) +
  geom_boxplot() +
  labs(title = "East Coast States November 2010 Rent Prices without Outliers")

ggplot(eastcoast_annualClean, aes(x = November.2010)) +
  geom_boxplot(
    color = "red",
    outlier.shape = 9,
    outlier.size = 2,
    outlier.color = "red"
  ) +
  labs(title = "Entire East Coast November 2010 Boxplot") +
  scale_x_continuous(breaks = seq(500, 4000, 500))

ggplot(eastcoast_annual, aes(x = State, y = November.2016, color = State)) +
  geom_boxplot() +
  labs(title = "East Coast States November 2016 Rent Prices with Outliers")

ggplot(eastcoast_annualClean, aes(x = State, y = November.2016, color = State)) +
  geom_boxplot() +
  labs(title = "East Coast States November 2016 Rent Prices without Outliers")

ggplot(eastcoast_annualClean, aes(x = November.2016)) +
  geom_boxplot(
    color = "red",
    outlier.shape = 9,
    outlier.size = 2,
    outlier.color = "red"
  ) +
  labs(title = "Entire East Coast November 2016 Boxplot") +
  scale_x_continuous(breaks = seq(500, 4000, 500))

Scatter plot of West Coast

# November 2010 rent against November 2016 rent for every West Coast location,
# coloured by state: first from the raw table, then from the cleaned table.
ggplot(
  westcoast_annual,
  aes(x = November.2010, y = November.2016, color = State)
) +
  geom_point() +
  labs(
    title = "West Coast November 2010 and November 2016 with Outliers",
    x = "November.2010 Rent Prices",
    y = "November.2016 Rent Prices"
  )

ggplot(
  westcoast_annualClean,
  aes(x = November.2010, y = November.2016, color = State)
) +
  geom_point() +
  labs(
    title = "West Coast November 2010 and November 2016 without Outliers",
    x = "November.2010 Rent Prices",
    y = "November.2016 Rent Prices"
  )

Scatter plot of East Coast

# November 2010 rent against November 2016 rent for every East Coast location,
# coloured by state: first from the raw table, then from the cleaned table.
ggplot(
  eastcoast_annual,
  aes(x = November.2010, y = November.2016, color = State)
) +
  geom_point() +
  labs(
    title = "East Coast November 2010 and November 2016 with Outliers",
    x = "November.2010 Rent Prices",
    y = "November.2016 Rent Prices"
  )

ggplot(
  eastcoast_annualClean,
  aes(x = November.2010, y = November.2016, color = State)
) +
  geom_point() +
  labs(
    title = "East Coast November 2010 and November 2016 without Outliers",
    x = "November.2010 Rent Prices",
    y = "November.2016 Rent Prices"
  )

QQ Plots without Outliers

# Normal QQ plots (points plus reference line) of the cleaned November 2010
# and November 2016 rents, West Coast first and East Coast second.
qqnorm(
  westcoast_annualClean[["November.2010"]],
  main = "QQ-Plot West Coast November 2010 without Outliers"
)
qqline(westcoast_annualClean[["November.2010"]])

qqnorm(
  westcoast_annualClean[["November.2016"]],
  main = "QQ-Plot West Coast November 2016 without Outliers"
)
qqline(westcoast_annualClean[["November.2016"]])

qqnorm(
  eastcoast_annualClean[["November.2010"]],
  main = "QQ-Plot East Coast November 2010 without Outliers"
)
qqline(eastcoast_annualClean[["November.2010"]])

qqnorm(
  eastcoast_annualClean[["November.2016"]],
  main = "QQ-Plot East Coast November 2016 without Outliers"
)
qqline(eastcoast_annualClean[["November.2016"]])

## QQ Plots with Outliers

# Normal QQ plots (points plus reference line) of the raw November 2010 and
# November 2016 rents, West Coast first and East Coast second.
qqnorm(
  westcoast_annual[["November.2010"]],
  main = "QQ-Plot West Coast November 2010 with Outliers"
)
qqline(westcoast_annual[["November.2010"]])

qqnorm(
  westcoast_annual[["November.2016"]],
  main = "QQ-Plot West Coast November 2016 with Outliers"
)
qqline(westcoast_annual[["November.2016"]])

qqnorm(
  eastcoast_annual[["November.2010"]],
  main = "QQ-Plot East Coast November 2010 with Outliers"
)
qqline(eastcoast_annual[["November.2010"]])

qqnorm(
  eastcoast_annual[["November.2016"]],
  main = "QQ-Plot East Coast November 2016 with Outliers"
)
qqline(eastcoast_annual[["November.2016"]])

Format Date

# Replace the dots in the date labels with spaces.
westAnn$Date <- gsub("\\.", " ", westAnn$Date)
# Every column after Date holds one city's prices. The range is taken from the
# frame itself, before any summary column is appended, instead of hard-coding
# the number of cities (previously 2:1222), so the code keeps working when the
# set of locations changes.
west_city_cols <- seq_len(ncol(westAnn))[-1]
westAnn <- westAnn %>%
  mutate(Avg.Price = rowMeans(westAnn[west_city_cols]))
westAnn$Max <- rowMaxs(as.matrix(westAnn[west_city_cols]))
westAnn$Min <- rowMins(as.matrix(westAnn[west_city_cols]))
ggplot(westAnn, aes(Date, Avg.Price)) +
  geom_point() +
  labs(title = "Average Price of the Years on the West Coast")

# Same steps for the East Coast table (previously hard-coded as 2:3989).
eastAnn$Date <- gsub("\\.", " ", eastAnn$Date)
east_city_cols <- seq_len(ncol(eastAnn))[-1]
eastAnn <- eastAnn %>%
  mutate(Avg.Price = rowMeans(eastAnn[east_city_cols]))
eastAnn$Max <- rowMaxs(as.matrix(eastAnn[east_city_cols]))
eastAnn$Min <- rowMins(as.matrix(eastAnn[east_city_cols]))
ggplot(eastAnn, aes(Date, Avg.Price)) +
  geom_point() +
  labs(title = "Average Price of the Years on the East Coast")

# Keep only Date and the appended summary columns (Avg.Price, Max, Min).
westAnnAvg <- westAnn[setdiff(seq_along(westAnn), west_city_cols)]
eastAnnAvg <- eastAnn[setdiff(seq_along(eastAnn), east_city_cols)]

T Test of First and Last Month of East and West Coast

# NOTE(review): BSDA is loaded for z.test, but only t.test() is called in the
# visible part of this section — confirm whether the package is still needed.
loadPkg("BSDA") # for z.test
# One-sample t-test on the cleaned West Coast November 2010 rents; no mu is
# given, so the test is against a mean of 0 and the result is mainly useful
# for its 80% confidence interval of the mean.
west2010_80 = t.test(x=westcoast_annualClean$November.2010, conf.level = 0.80)
west2010_80
One Sample t-test

data: westcoast_annualClean$November.2010 t = 106, df = 1156, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 80 percent confidence interval: 1515 1553 sample estimates: mean of x 1534

# 80% confidence interval for the mean cleaned West Coast November 2016 rent
# (one-sample t-test, no mu supplied).
west2016_80 = t.test(x=westcoast_annualClean$November.2016, conf.level = 0.80)
west2016_80
One Sample t-test

data: westcoast_annualClean$November.2016 t = 90, df = 1150, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 80 percent confidence interval: 1798 1849 sample estimates: mean of x 1823

# 99% confidence interval for the mean cleaned West Coast November 2010 rent
# (one-sample t-test, no mu supplied).
west2010_99 = t.test(x=westcoast_annualClean$November.2010, conf.level = 0.99)
west2010_99
One Sample t-test

data: westcoast_annualClean$November.2010 t = 106, df = 1156, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 99 percent confidence interval: 1497 1571 sample estimates: mean of x 1534

# 99% confidence interval for the mean cleaned West Coast November 2016 rent
# (one-sample t-test, no mu supplied).
west2016_99 = t.test(x=westcoast_annualClean$November.2016, conf.level = 0.99)
west2016_99
One Sample t-test

data: westcoast_annualClean$November.2016 t = 90, df = 1150, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 99 percent confidence interval: 1771 1876 sample estimates: mean of x 1823

# 80% confidence interval for the mean cleaned East Coast November 2010 rent
# (one-sample t-test, no mu supplied).
east2010_80 = t.test(x=eastcoast_annualClean$November.2010, conf.level = 0.80)
east2010_80
One Sample t-test

data: eastcoast_annualClean$November.2010 t = 220, df = 3816, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 80 percent confidence interval: 1362 1378 sample estimates: mean of x 1370

# 80% confidence interval for the mean cleaned East Coast November 2016 rent
# (one-sample t-test, no mu supplied).
east2016_80 = t.test(x=eastcoast_annualClean$November.2016, conf.level = 0.80)
east2016_80
One Sample t-test

data: eastcoast_annualClean$November.2016 t = 204, df = 3825, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 80 percent confidence interval: 1489 1508 sample estimates: mean of x 1498

# 99% confidence interval for the mean cleaned East Coast November 2010 rent
# (one-sample t-test, no mu supplied).
east2010_99 = t.test(x=eastcoast_annualClean$November.2010, conf.level = 0.99)
east2010_99
One Sample t-test

data: eastcoast_annualClean$November.2010 t = 220, df = 3816, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 99 percent confidence interval: 1354 1386 sample estimates: mean of x 1370

# 99% confidence interval for the mean cleaned East Coast November 2016 rent
# (one-sample t-test, no mu supplied).
east2016_99 = t.test(x=eastcoast_annualClean$November.2016, conf.level = 0.99)
east2016_99
One Sample t-test

data: eastcoast_annualClean$November.2016 t = 204, df = 3825, p-value <2e-16 alternative hypothesis: true mean is not equal to 0 99 percent confidence interval: 1480 1517 sample estimates: mean of x 1498

\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ Start of Cities of Interest File \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

Load the file

# Re-read the price table, then keep the thirteen rows of interest (selected
# by row position) and drop columns 1 and 3 to 6, which leaves the City column
# followed by the monthly prices.
zillow_prices <- read.csv("price.csv")
zillow_focus <- zillow_prices[
  c(1:5, 7, 13, 17, 23, 24, 26, 113, 162),
  -c(1, 3:6)
]
zillow_focus

Clean and Transpose

Our cities of interest are NYC (Queens) (only data from December 2011 onwards), LA, PA, Houston (Harris), Chicago (Cook), Dallas, Las Vegas, San Fran, Detroit (Wayne), Roanoke, Richmond, District of Columbia, Baltimore, Seattle (King). Here, we clean the data set and transpose it so that dates become rows and cities become columns.

# Turn the city names into row names, drop the City column and transpose, so
# that every row is a month and every column is a city; then strip spaces and
# commas from the city column names.
rownames(zillow_focus) <- zillow_focus[["City"]]
zillow_focus <- as.data.frame(t(zillow_focus[-1]))
names(zillow_focus) <- str_replace_all(
  names(zillow_focus),
  c(" " = "" , "," = "")
)
head(zillow_focus)
##               NewYork LosAngeles Chicago Houston Philadelphia LasVegas
## November.2010      NA       2184    1563    1198         1092     1188
## December.2010      NA       2184    1555    1199         1099     1183
## January.2011       NA       2183    1547    1199         1094     1178
## February.2011      NA       2188    1537    1200         1087     1177
## March.2011         NA       2189    1526    1203         1080     1178
## April.2011         NA       2189    1517    1205         1080     1179
##               SanFrancisco Detroit Seattle Baltimore Washington Richmond
## November.2010         3188     847    1746      1192       2174      892
## December.2010         3207     844    1740      1195       2203      898
## January.2011          3189     832    1736      1203       2241      900
## February.2011         3127     820    1734      1210       2276      902
## March.2011            3040     810    1730      1220       2303      902
## April.2011            2970     806    1722      1228       2320      904
##               Roanoke
## November.2010    1065
## December.2010    1067
## January.2011     1062
## February.2011    1060
## March.2011       1059
## April.2011       1066
str(zillow_focus)
## 'data.frame':    75 obs. of  13 variables:
##  $ NewYork     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LosAngeles  : int  2184 2184 2183 2188 2189 2189 2188 2191 2189 2186 ...
##  $ Chicago     : int  1563 1555 1547 1537 1526 1517 1507 1497 1493 1491 ...
##  $ Houston     : int  1198 1199 1199 1200 1203 1205 1204 1199 1194 1190 ...
##  $ Philadelphia: int  1092 1099 1094 1087 1080 1080 1083 1082 1082 1085 ...
##  $ LasVegas    : int  1188 1183 1178 1177 1178 1179 1176 1170 1163 1158 ...
##  $ SanFrancisco: int  3188 3207 3189 3127 3040 2970 2916 2879 2848 2848 ...
##  $ Detroit     : int  847 844 832 820 810 806 802 800 797 796 ...
##  $ Seattle     : int  1746 1740 1736 1734 1730 1722 1713 1709 1708 1706 ...
##  $ Baltimore   : int  1192 1195 1203 1210 1220 1228 1233 1230 1223 1214 ...
##  $ Washington  : int  2174 2203 2241 2276 2303 2320 2323 2318 2318 2322 ...
##  $ Richmond    : int  892 898 900 902 902 904 909 917 933 951 ...
##  $ Roanoke     : int  1065 1067 1062 1060 1059 1066 1070 1070 1062 1049 ...

Turn date into a column

# Move the month labels out of the row names into a leading Date column and
# renumber the rows. seq_len() stays well-defined for a zero-row frame, unlike
# 1:nrow().
zillow_focus <- cbind(Date = rownames(zillow_focus), zillow_focus)
rownames(zillow_focus) <- seq_len(nrow(zillow_focus))
head(zillow_focus)
##            Date NewYork LosAngeles Chicago Houston Philadelphia LasVegas
## 1 November.2010      NA       2184    1563    1198         1092     1188
## 2 December.2010      NA       2184    1555    1199         1099     1183
## 3  January.2011      NA       2183    1547    1199         1094     1178
## 4 February.2011      NA       2188    1537    1200         1087     1177
## 5    March.2011      NA       2189    1526    1203         1080     1178
## 6    April.2011      NA       2189    1517    1205         1080     1179
##   SanFrancisco Detroit Seattle Baltimore Washington Richmond Roanoke
## 1         3188     847    1746      1192       2174      892    1065
## 2         3207     844    1740      1195       2203      898    1067
## 3         3189     832    1736      1203       2241      900    1062
## 4         3127     820    1734      1210       2276      902    1060
## 5         3040     810    1730      1220       2303      902    1059
## 6         2970     806    1722      1228       2320      904    1066

Convert the Date column from chr to date, and Total

Here, we are converting strings to dates and to %d/%m/%Y form with 01 for %d

# Work on a scratch copy: split each "Month.Year" label into its month name
# and year, translate the month name to its number through month.name, and
# assemble a "01/<month>/<year>" string that as.Date() can parse.
zillow_focus_test <- zillow_focus
zillow_focus_test$Date <- str_replace_all(zillow_focus_test$Date, "[.]", " ")
zillow_focus_test$Month <- str_extract(zillow_focus_test$Date, "(\\w+)")
zillow_focus_test$Year <- str_extract(zillow_focus_test$Date, "\\w+$")
tmp_date <- as.Date(
  paste(
    "01",
    match(zillow_focus_test$Month, month.name),
    zillow_focus_test$Year,
    sep = "/"
  ),
  "%d/%m/%Y"
)
# Only the parsed dates are written back to the real table.
zillow_focus$Date <- tmp_date
head(zillow_focus)
##         Date NewYork LosAngeles Chicago Houston Philadelphia LasVegas
## 1 2010-11-01      NA       2184    1563    1198         1092     1188
## 2 2010-12-01      NA       2184    1555    1199         1099     1183
## 3 2011-01-01      NA       2183    1547    1199         1094     1178
## 4 2011-02-01      NA       2188    1537    1200         1087     1177
## 5 2011-03-01      NA       2189    1526    1203         1080     1178
## 6 2011-04-01      NA       2189    1517    1205         1080     1179
##   SanFrancisco Detroit Seattle Baltimore Washington Richmond Roanoke
## 1         3188     847    1746      1192       2174      892    1065
## 2         3207     844    1740      1195       2203      898    1067
## 3         3189     832    1736      1203       2241      900    1062
## 4         3127     820    1734      1210       2276      902    1060
## 5         3040     810    1730      1220       2303      902    1059
## 6         2970     806    1722      1228       2320      904    1066
str(zillow_focus)
## 'data.frame':    75 obs. of  14 variables:
##  $ Date        : Date, format: "2010-11-01" "2010-12-01" ...
##  $ NewYork     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LosAngeles  : int  2184 2184 2183 2188 2189 2189 2188 2191 2189 2186 ...
##  $ Chicago     : int  1563 1555 1547 1537 1526 1517 1507 1497 1493 1491 ...
##  $ Houston     : int  1198 1199 1199 1200 1203 1205 1204 1199 1194 1190 ...
##  $ Philadelphia: int  1092 1099 1094 1087 1080 1080 1083 1082 1082 1085 ...
##  $ LasVegas    : int  1188 1183 1178 1177 1178 1179 1176 1170 1163 1158 ...
##  $ SanFrancisco: int  3188 3207 3189 3127 3040 2970 2916 2879 2848 2848 ...
##  $ Detroit     : int  847 844 832 820 810 806 802 800 797 796 ...
##  $ Seattle     : int  1746 1740 1736 1734 1730 1722 1713 1709 1708 1706 ...
##  $ Baltimore   : int  1192 1195 1203 1210 1220 1228 1233 1230 1223 1214 ...
##  $ Washington  : int  2174 2203 2241 2276 2303 2320 2323 2318 2318 2322 ...
##  $ Richmond    : int  892 898 900 902 902 904 909 917 933 951 ...
##  $ Roanoke     : int  1065 1067 1062 1060 1059 1066 1070 1070 1062 1049 ...

Add a total column, not including New York (which has NAs)

# Row total over every city except NewYork (its early months are NA, which
# would turn each affected total into NA). The columns are chosen by name
# rather than by the fixed positions 3:14, so the total stays correct if the
# column order or the set of cities changes.
zillow_total <- zillow_focus
zillow_total$Total <- rowSums(
  zillow_total[setdiff(names(zillow_total), c("Date", "NewYork"))]
)
head(zillow_total)
##         Date NewYork LosAngeles Chicago Houston Philadelphia LasVegas
## 1 2010-11-01      NA       2184    1563    1198         1092     1188
## 2 2010-12-01      NA       2184    1555    1199         1099     1183
## 3 2011-01-01      NA       2183    1547    1199         1094     1178
## 4 2011-02-01      NA       2188    1537    1200         1087     1177
## 5 2011-03-01      NA       2189    1526    1203         1080     1178
## 6 2011-04-01      NA       2189    1517    1205         1080     1179
##   SanFrancisco Detroit Seattle Baltimore Washington Richmond Roanoke Total
## 1         3188     847    1746      1192       2174      892    1065 18329
## 2         3207     844    1740      1195       2203      898    1067 18374
## 3         3189     832    1736      1203       2241      900    1062 18364
## 4         3127     820    1734      1210       2276      902    1060 18318
## 5         3040     810    1730      1220       2303      902    1059 18240
## 6         2970     806    1722      1228       2320      904    1066 18186

Line plot of total

Line plots of time series, one with only time

# Time series of the summed rent across the selected cities, drawn as points
# joined by a line.
ggplot(zillow_total, aes(x = Date, y = Total)) +
  geom_point() +
  geom_line() +
  labs(
    x = "Time",
    y = "Total Rent Prices",
    title = "Plot of City Rent Change over Time"
  )

Plots of the Cities and Prices

Melt the data frame by city so it can be plotted with ggplot

# Reshape to long form: one row per (Date, city) pair. The second and third
# columns of the melted frame are renamed to City and Prices.
zillow_melt <- melt(zillow_focus, id = "Date")
names(zillow_melt)[2:3] <- c("City", "Prices")
head(zillow_melt)
##         Date    City Prices
## 1 2010-11-01 NewYork     NA
## 2 2010-12-01 NewYork     NA
## 3 2011-01-01 NewYork     NA
## 4 2011-02-01 NewYork     NA
## 5 2011-03-01 NewYork     NA
## 6 2011-04-01 NewYork     NA

I’m not sure if histograms would be meaningful in the context of my part, but here’s one of all the cities melted together, ignoring New York again. We can do more if you guys believe it’s necessary. There is a huge skew because the bigger/more expensive cities chosen would have higher rent prices due to a difference in the cost of living. I’ll also do the Mean/Median/Quartiles of the overall plots here.

# Distribution of all city prices pooled together. na.rm is a layer parameter,
# not an aesthetic: inside aes() it had no effect on how missing prices were
# handled, so it is passed to the geoms, where it drops the NAs silently.
ggplot(zillow_melt, aes(x = Prices)) +
  geom_histogram(binwidth = 300, na.rm = TRUE)

ggplot(zillow_melt, aes(x = Prices)) +
  geom_boxplot(na.rm = TRUE)

summary(zillow_melt$Prices)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     743    1106    1319    1671    2210    4547      13

Lines of the time series over their changes.

# One rent-price line per city. na.rm is a layer parameter, not an aesthetic,
# so it is passed to the geoms instead of aes(), where it had no effect on
# missing-value handling.
ggplot(zillow_melt, aes(x = Date, y = Prices, group = City, color = City)) +
  geom_point(na.rm = TRUE) +
  geom_line(na.rm = TRUE) +
  labs(
    x = "Time",
    y = "Rent Price",
    title = "Plot of City Rent Prices over Time"
  )

First Order change plots

Find the first order change, difference between each month

# Month-over-month difference for every city column (everything except the
# leading Date column). diff() returns one value fewer than its input, so each
# difference is labelled with the date of the later month. Negative indexing
# replaces the hard-coded 2:14 / 2:75 ranges, so the code no longer depends on
# the exact number of cities or months.
zillow_change <- as.data.frame(lapply(zillow_focus[-1], diff, lag = 1))
zillow_change$Date <- zillow_focus$Date[-1]
zillow_change <- zillow_change %>% dplyr::select(Date, everything())
head(zillow_change)
##         Date NewYork LosAngeles Chicago Houston Philadelphia LasVegas
## 1 2010-12-01      NA          0      -8       1            7       -5
## 2 2011-01-01      NA         -1      -8       0           -5       -5
## 3 2011-02-01      NA          5     -10       1           -7       -1
## 4 2011-03-01      NA          1     -11       3           -7        1
## 5 2011-04-01      NA          0      -9       2            0        1
## 6 2011-05-01      NA         -1     -10      -1            3       -3
##   SanFrancisco Detroit Seattle Baltimore Washington Richmond Roanoke
## 1           19      -3      -6         3         29        6       2
## 2          -18     -12      -4         8         38        2      -5
## 3          -62     -12      -2         7         35        2      -2
## 4          -87     -10      -4        10         27        0      -1
## 5          -70      -4      -8         8         17        2       7
## 6          -54      -4      -9         5          3        5       4

Melt, and line graph of the changes for each city. The graph is kind of hard to visualize, but it looks like San Francisco had the highest change from time to time.

# Long form of the monthly changes, then one line per city. na.rm is a layer
# parameter, not an aesthetic, so it is passed to the geoms instead of aes(),
# where it had no effect on missing-value handling.
zillow_cmelt <- melt(zillow_change, id = "Date")
names(zillow_cmelt)[2] <- "City"
names(zillow_cmelt)[3] <- "PriceChange"
ggplot(zillow_cmelt, aes(x = Date, y = PriceChange, group = City, color = City)) +
  geom_point(na.rm = TRUE) +
  geom_line(na.rm = TRUE) +
  labs(
    x = "Time",
    y = "Price Change",
    title = "Plot of City Rent Change over Time"
  )

Cumulative change, to make the trends more clear in respect to each other

# Running total of the monthly changes for every city column (all columns of
# zillow_change except its leading Date column). The dates are taken from
# zillow_change itself, which keeps them aligned with the rows being summed
# and removes the hard-coded 2:14 / 2:75 ranges.
zillow_cumulative <- as.data.frame(cumsum(zillow_change[-1]))
zillow_cumulative$Date <- zillow_change$Date
zillow_cumulative <- zillow_cumulative %>% dplyr::select(Date, everything())
head(zillow_cumulative)
##         Date NewYork LosAngeles Chicago Houston Philadelphia LasVegas
## 1 2010-12-01      NA          0      -8       1            7       -5
## 2 2011-01-01      NA         -1     -16       1            2      -10
## 3 2011-02-01      NA          4     -26       2           -5      -11
## 4 2011-03-01      NA          5     -37       5          -12      -10
## 5 2011-04-01      NA          5     -46       7          -12       -9
## 6 2011-05-01      NA          4     -56       6           -9      -12
##   SanFrancisco Detroit Seattle Baltimore Washington Richmond Roanoke
## 1           19      -3      -6         3         29        6       2
## 2            1     -15     -10        11         67        8      -3
## 3          -61     -27     -12        18        102       10      -5
## 4         -148     -37     -16        28        129       10      -6
## 5         -218     -41     -24        36        146       12       1
## 6         -272     -45     -33        41        149       17       5

Another graph, this time of cumulative change. San Francisco had the greatest change, while Seattle had the second largest. Detroit had the most negative change, falling below 0 overall.

# Long form of the cumulative changes, then one line per city. na.rm is a
# layer parameter, not an aesthetic, so it is passed to the geoms instead of
# aes(), where it had no effect on missing-value handling.
zillow_cumumelt <- melt(zillow_cumulative, id = "Date")
names(zillow_cumumelt)[2] <- "City"
names(zillow_cumumelt)[3] <- "CumulativePriceChange"
ggplot(
  zillow_cumumelt,
  aes(x = Date, y = CumulativePriceChange, group = City, color = City)
) +
  geom_point(na.rm = TRUE) +
  geom_line(na.rm = TRUE) +
  labs(
    x = "Time",
    y = "Cumulative Change",
    title = "Plot of Cumulative City Rent Change over Time"
  )

First Order change, as a percentage proportion of the total rent

Find the first order change as a Percent

# Monthly change expressed as a percentage of the rent. Both operands drop
# their Date column, and zillow_focus also drops its first row so that the two
# frames have the same number of rows; negative indexing replaces the
# hard-coded 2:14 / 2:75 ranges.
# NOTE(review): the denominator is the price of the month the change arrives
# at (rows 2..n of zillow_focus), not the preceding month's price. A
# conventional percent change would divide by the earlier month — confirm
# which is intended before changing it, since the reported summary statistics
# were produced with this definition.
zillow_proportion <- zillow_change[-1] / zillow_focus[-1, -1] * 100
zillow_proportion$Date <- zillow_change$Date
zillow_proportion <- zillow_proportion %>% dplyr::select(Date, everything())
head(zillow_proportion)
##         Date NewYork LosAngeles Chicago Houston Philadelphia LasVegas
## 1 2010-12-01      NA     0.0000  -0.514  0.0834        0.637  -0.4227
## 2 2011-01-01      NA    -0.0458  -0.517  0.0000       -0.457  -0.4244
## 3 2011-02-01      NA     0.2285  -0.651  0.0833       -0.644  -0.0850
## 4 2011-03-01      NA     0.0457  -0.721  0.2494       -0.648   0.0849
## 5 2011-04-01      NA     0.0000  -0.593  0.1660        0.000   0.0848
## 6 2011-05-01      NA    -0.0457  -0.664 -0.0831        0.277  -0.2551
##   SanFrancisco Detroit Seattle Baltimore Washington Richmond Roanoke
## 1        0.592  -0.355  -0.345     0.251      1.316    0.668  0.1874
## 2       -0.564  -1.442  -0.230     0.665      1.696    0.222 -0.4708
## 3       -1.983  -1.463  -0.115     0.579      1.538    0.222 -0.1887
## 4       -2.862  -1.235  -0.231     0.820      1.172    0.000 -0.0944
## 5       -2.357  -0.496  -0.465     0.651      0.733    0.221  0.6567
## 6       -1.852  -0.499  -0.525     0.406      0.129    0.550  0.3738

It looks like Richmond has a pretty varied proportional change. It’s still hard to make anything out of it, so let’s move to the cumulative proportional change again.

# Reshape the percentage changes to long format (one row per Date/City pair).
zillow_pmelt <- melt(zillow_proportion, id = "Date")
names(zillow_pmelt)[2] <- "City"
names(zillow_pmelt)[3] <- "ProportionChange"
# na.rm is a layer argument, not an aesthetic: it has no effect inside aes(),
# so it is passed to each geom, where it silently drops missing values.
ggplot(
  zillow_pmelt,
  aes(x = Date, y = ProportionChange, group = City, color = City)
) +
  geom_point(na.rm = TRUE) +
  geom_line(na.rm = TRUE) +
  labs(x = "Time", y = "Proportional Change", title = "Plot of City Rent Change over Time")

Quick histogram of the distribution of proportional changes. Taken together as a whole, the proportional changes over time look fairly normal, with maybe a slight right skew, which makes sense as we would expect rent prices to generally increase over time with inflation and whatnot. Remember, these changes are in percentages. There are outliers on both sides of the boxplot, which matches what we see in the histogram. The normal Q-Q plot, summary statistics, variance, and standard deviation follow as well.

# Distribution of the percentage changes across all cities and months.
# na.rm belongs on the geom (it has no effect inside aes()); there it drops
# the missing values without a warning.
ggplot(zillow_pmelt, aes(x = ProportionChange)) +
  geom_histogram(na.rm = TRUE) +
  labs(x = "Proportional Change in Percentages")

ggplot(zillow_pmelt, aes(x = ProportionChange)) +
  geom_boxplot(na.rm = TRUE)

# Normal Q-Q plot of the same values
qqnorm(zillow_pmelt$ProportionChange)

# Five-number summary (plus mean and NA count) of the percentage changes
summary(zillow_pmelt[["ProportionChange"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -2.90   -0.13    0.18    0.20    0.54    2.45      13
# Sample variance, ignoring missing values
var(zillow_pmelt[["ProportionChange"]], na.rm = TRUE)
## [1] 0.362
# Sample standard deviation, ignoring missing values
sd(zillow_pmelt[["ProportionChange"]], na.rm = TRUE)
## [1] 0.602

Next, we compute the cumulative sum of the proportional changes. We will drop New York before computing and plotting the values, as there are too many NAs.

# Keep columns 3:14 only, i.e. everything after Date and NewYork.
zillow_pdrop <- zillow_proportion[, 3:14]
# Running total of the percentage changes, column by column.
zillow_pcumulative <- as.data.frame(cumsum(zillow_pdrop))
# Re-attach the dates and move Date to the front.
zillow_pcumulative[["Date"]] <- zillow_focus[["Date"]][2:75]
zillow_pcumulative <- dplyr::select(zillow_pcumulative, Date, everything())
head(zillow_pcumulative)
##         Date LosAngeles Chicago Houston Philadelphia LasVegas SanFrancisco
## 1 2010-12-01     0.0000  -0.514  0.0834        0.637   -0.423        0.592
## 2 2011-01-01    -0.0458  -1.032  0.0834        0.180   -0.847        0.028
## 3 2011-02-01     0.1827  -1.682  0.1667       -0.464   -0.932       -1.955
## 4 2011-03-01     0.2284  -2.403  0.4161       -1.112   -0.847       -4.817
## 5 2011-04-01     0.2284  -2.996  0.5821       -1.112   -0.762       -7.173
## 6 2011-05-01     0.1827  -3.660  0.4990       -0.835   -1.017       -9.025
##   Detroit Seattle Baltimore Washington Richmond Roanoke
## 1  -0.355  -0.345     0.251       1.32    0.668  0.1874
## 2  -1.798  -0.575     0.916       3.01    0.890 -0.2834
## 3  -3.261  -0.691     1.495       4.55    1.112 -0.4720
## 4  -4.496  -0.922     2.314       5.72    1.112 -0.5665
## 5  -4.992  -1.386     2.966       6.45    1.333  0.0902
## 6  -5.491  -1.912     3.371       6.58    1.883  0.4640

Again, New York is dropped before plotting the values, as there are too many NAs. Now, it looks like the cumulative proportional change of Seattle is a bit higher than that of San Francisco, and Richmond is up there too. Detroit still has the most negative proportional change overall.

# Reshape the cumulative percentage changes to long format
# (one row per Date/City pair).
zillow_pcumumelt <- melt(zillow_pcumulative, id = "Date")
names(zillow_pcumumelt)[2] <- "City"
names(zillow_pcumumelt)[3] <- "CumulativeProportionChange"
# na.rm is a layer argument, not an aesthetic, so it is passed to the geoms.
# The title says "Proportional" to tell this plot apart from the plot of
# cumulative absolute change.
ggplot(
  zillow_pcumumelt,
  aes(x = Date, y = CumulativeProportionChange, group = City, color = City)
) +
  geom_point(na.rm = TRUE) +
  geom_line(na.rm = TRUE) +
  labs(x = "Time", y = "Cumu. Prop. Change", title = "Plot of Cumulative Proportional City Rent Change over Time")

Box plot of each city’s proportional change. As expected, San Francisco and Roanoke seem to have the largest ranges and variances. We are also dropping New York again.

# Drop column 2 (NewYork) and reshape to long format for a per-city box plot.
zillow_prodrop <- zillow_proportion[-2]
zillow_pdmelt <- melt(zillow_prodrop, id = "Date")
names(zillow_pdmelt)[2] <- "City"
names(zillow_pdmelt)[3] <- "ProportionChange"
# na.rm is a layer argument, not an aesthetic, so it is passed to the geom.
ggplot(zillow_pdmelt, aes(x = City, y = ProportionChange, color = City)) +
  geom_boxplot(outlier.shape = 8, outlier.size = 4, na.rm = TRUE)

Anova Test and Post-hoc Tukey HSD

We will now run an ANOVA on the proportional changes and see whether the average change differs between cities. Since we have a p-value less than our alpha, there are significant differences in changes, and thus we look at the post-hoc Tukey HSD. It looks like the pairs that have significantly different average proportional changes are [Detroit-LosAngeles], [Roanoke-LosAngeles], [SanFrancisco-Chicago], [Seattle-Chicago], [Richmond-Chicago], [Detroit-Houston], [Seattle-Philadelphia], [SanFrancisco-LasVegas], [Seattle-LasVegas], [Richmond-LasVegas], [Detroit-SanFrancisco], [Roanoke-SanFrancisco], [Seattle-Detroit], [Washington-Detroit], [Richmond-Detroit], [Baltimore-Seattle], [Roanoke-Seattle], and finally, [Roanoke-Richmond].

# One-way ANOVA of the percentage change with City as the only factor.
# The formula is written with `$` references; the printed output below echoes
# that exact call and term label.
pm_anova = aov(zillow_pdmelt$ProportionChange ~ zillow_pdmelt$City)
summary(pm_anova)
##                     Df Sum Sq Mean Sq F value  Pr(>F)    
## zillow_pdmelt$City  11   31.4   2.855    8.52 2.1e-14 ***
## Residuals          876  293.6   0.335                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc Tukey HSD: pairwise differences in mean percentage change between
# cities, with family-wise 95% confidence intervals and adjusted p-values.
tukeyAoV <- TukeyHSD(pm_anova)
tukeyAoV
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = zillow_pdmelt$ProportionChange ~ zillow_pdmelt$City)
## 
## $`zillow_pdmelt$City`
##                               diff     lwr      upr p adj
## Chicago-LosAngeles        -0.25457 -0.5665  0.05734 0.241
## Houston-LosAngeles        -0.07357 -0.3855  0.23834 1.000
## Philadelphia-LosAngeles   -0.17203 -0.4839  0.13988 0.814
## LasVegas-LosAngeles       -0.25543 -0.5673  0.05648 0.236
## SanFrancisco-LosAngeles    0.11176 -0.2001  0.42366 0.991
## Detroit-LosAngeles        -0.47720 -0.7891 -0.16529 0.000
## Seattle-LosAngeles         0.16025 -0.1517  0.47215 0.876
## Baltimore-LosAngeles      -0.18919 -0.5011  0.12272 0.702
## Washington-LosAngeles     -0.08766 -0.3996  0.22424 0.999
## Richmond-LosAngeles        0.08187 -0.2300  0.39377 0.999
## Roanoke-LosAngeles        -0.37085 -0.6828 -0.05895 0.006
## Houston-Chicago            0.18100 -0.1309  0.49291 0.758
## Philadelphia-Chicago       0.08254 -0.2294  0.39445 0.999
## LasVegas-Chicago          -0.00086 -0.3128  0.31104 1.000
## SanFrancisco-Chicago       0.36633  0.0544  0.67823 0.007
## Detroit-Chicago           -0.22263 -0.5345  0.08928 0.450
## Seattle-Chicago            0.41482  0.1029  0.72672 0.001
## Baltimore-Chicago          0.06538 -0.2465  0.37729 1.000
## Washington-Chicago         0.16691 -0.1450  0.47881 0.842
## Richmond-Chicago           0.33644  0.0245  0.64834 0.022
## Roanoke-Chicago           -0.11628 -0.4282  0.19562 0.987
## Philadelphia-Houston      -0.09846 -0.4104  0.21344 0.997
## LasVegas-Houston          -0.18186 -0.4938  0.13004 0.752
## SanFrancisco-Houston       0.18533 -0.1266  0.49723 0.729
## Detroit-Houston           -0.40363 -0.7155 -0.09172 0.001
## Seattle-Houston            0.23381 -0.0781  0.54572 0.369
## Baltimore-Houston         -0.11562 -0.4275  0.19628 0.988
## Washington-Houston        -0.01409 -0.3260  0.29781 1.000
## Richmond-Houston           0.15544 -0.1565  0.46734 0.897
## Roanoke-Houston           -0.29728 -0.6092  0.01462 0.079
## LasVegas-Philadelphia     -0.08340 -0.3953  0.22850 0.999
## SanFrancisco-Philadelphia  0.28379 -0.0281  0.59569 0.116
## Detroit-Philadelphia      -0.30517 -0.6171  0.00673 0.062
## Seattle-Philadelphia       0.33227  0.0204  0.64418 0.025
## Baltimore-Philadelphia    -0.01716 -0.3291  0.29474 1.000
## Washington-Philadelphia    0.08437 -0.2275  0.39627 0.999
## Richmond-Philadelphia      0.25390 -0.0580  0.56580 0.244
## Roanoke-Philadelphia      -0.19882 -0.5107  0.11308 0.631
## SanFrancisco-LasVegas      0.36719  0.0553  0.67909 0.007
## Detroit-LasVegas          -0.22177 -0.5337  0.09014 0.456
## Seattle-LasVegas           0.41568  0.1038  0.72758 0.001
## Baltimore-LasVegas         0.06624 -0.2457  0.37815 1.000
## Washington-LasVegas        0.16777 -0.1441  0.47967 0.838
## Richmond-LasVegas          0.33730  0.0254  0.64920 0.021
## Roanoke-LasVegas          -0.11542 -0.4273  0.19648 0.988
## Detroit-SanFrancisco      -0.58895 -0.9009 -0.27705 0.000
## Seattle-SanFrancisco       0.04849 -0.2634  0.36039 1.000
## Baltimore-SanFrancisco    -0.30095 -0.6129  0.01096 0.070
## Washington-SanFrancisco   -0.19942 -0.5113  0.11248 0.626
## Richmond-SanFrancisco     -0.02989 -0.3418  0.28202 1.000
## Roanoke-SanFrancisco      -0.48261 -0.7945 -0.17070 0.000
## Seattle-Detroit            0.63744  0.3255  0.94935 0.000
## Baltimore-Detroit          0.28801 -0.0239  0.59991 0.103
## Washington-Detroit         0.38953  0.0776  0.70144 0.003
## Richmond-Detroit           0.55907  0.2472  0.87097 0.000
## Roanoke-Detroit            0.10635 -0.2056  0.41825 0.994
## Baltimore-Seattle         -0.34943 -0.6613 -0.03753 0.014
## Washington-Seattle        -0.24791 -0.5598  0.06400 0.279
## Richmond-Seattle          -0.07838 -0.3903  0.23353 1.000
## Roanoke-Seattle           -0.53110 -0.8430 -0.21919 0.000
## Washington-Baltimore       0.10153 -0.2104  0.41343 0.996
## Richmond-Baltimore         0.27106 -0.0408  0.58296 0.162
## Roanoke-Baltimore         -0.18166 -0.4936  0.13024 0.754
## Richmond-Washington        0.16953 -0.1424  0.48144 0.828
## Roanoke-Washington        -0.28319 -0.5951  0.02872 0.118
## Roanoke-Richmond          -0.45272 -0.7646 -0.14082 0.000

State wise countplot

# Bar chart of the number of rows in zillow for each State
ggplot(data = zillow, mapping = aes(x = State)) +
  geom_bar(fill = "purple", colour = "blue", alpha = 0.6)

Time-series plot for top 10 cities

# Take the first `num_city` rows of zillow and transpose their monthly price
# columns (7:81) so that each column is one location and each row one month.
num_city <- 10
values <- head(zillow, n = num_city)
values <- data.frame(t(as.matrix(values[, 7:81])))
# Column 2 of zillow supplies the series names (presumably the city name).
colnames(values) <- zillow[seq_len(num_city), 2]

# One index entry per month, from November 2010 through January 2017
date <- as.yearmon(seq(as.Date("2010/11/01"), as.Date("2017/01/31"), by = "month"))
ts <- zoo(values, order.by = date)
values <- fortify(ts)
values$Index <- as.Date(values$Index)

# All series on a single panel (facets = NULL), with a small point per month
autoplot(ts, facets = NULL) +
  geom_point(size = 0.5) +
  labs(x = "Time", y = "Price") +
  theme_minimal()

Pricing Distribution for California (Box plots)

# Reshape the monthly price columns (7:81) to long format: one row per
# location and month. factor_key = TRUE keeps Month as a factor in the
# original column order (TRUE is spelled out because T can be reassigned).
price <- gather(data = zillow, "Month", "Price", 7:81, factor_key = TRUE)

# Box plot of prices per metro area for rows whose State is "CA".
# The colour label was dropped: no colour aesthetic is mapped, so it had no
# effect on the plot.
ggplot(price[price$State == "CA" & !is.na(price$Metro), ]) +
  geom_boxplot(
    aes(x = fct_rev(Metro), y = as.numeric(Price)),
    fill = "#FF9999", color = "#56B4E9", outlier.size = 0.5
  ) +
  # x is the metro area; coord_flip() below draws it on the vertical axis
  labs(x = "Metro Area", y = "Price") +
  coord_flip()

Pricing Distribution for Pennsylvania (Box plots)

# Reshape the monthly price columns (7:81) to long format: one row per
# location and month. factor_key = TRUE keeps Month as a factor in the
# original column order (TRUE is spelled out because T can be reassigned).
price <- gather(data = zillow, "Month", "Price", 7:81, factor_key = TRUE)

# Box plot of prices per metro area for rows whose State is "PA".
# The colour label was dropped: no colour aesthetic is mapped, so it had no
# effect on the plot.
ggplot(price[price$State == "PA" & !is.na(price$Metro), ]) +
  geom_boxplot(
    aes(x = fct_rev(Metro), y = as.numeric(Price)),
    fill = "#9999CC", color = "#66CC99", outlier.size = 0.5
  ) +
  # x is the metro area; coord_flip() below draws it on the vertical axis
  labs(x = "Metro Area", y = "Price") +
  coord_flip()

Dallas county rent prices (point plot across time)

# Rent prices for every city whose County is "Dallas": one point per city and
# month, coloured by month.
# NOTE(review): the filter matches on county name only; confirm that no other
# state contributes a county called "Dallas".
ggplot(
  price[price$County == "Dallas" & !is.na(price$City), ],
  aes(x = Price, y = fct_rev(City))
) +
  geom_point(aes(color = Month), shape = 20, size = 3, alpha = 0.4) +
  scale_color_discrete(l = 45, h = c(30, 330)) +
  labs(
    title = "Zillow | Dallas County Rent Prices",
    subtitle = "2010 - 2017",
    x = "Price", y = "Dallas County",
    colour = "Time"
  )

Los Angeles county rent prices (point plot across time)

# Rent prices for every city whose County is "Los Angeles": one point per
# city and month, coloured by month on the discrete viridis scale.
ggplot(
  price[price$County == "Los Angeles" & !is.na(price$City), ],
  aes(x = Price, y = fct_rev(City))
) +
  geom_point(aes(color = Month), shape = 16, size = 3, alpha = 0.4) +
  scale_colour_viridis_d() +
  labs(
    title = "Zillow | Los Angeles County Rent Prices",
    subtitle = "2010 - 2017",
    x = "Price", y = "LA County",
    colour = "Time"
  )

  # scale_colour_gradientn(colours=rainbow(4))

Average rent pricing with minimum maximum deviation

# Create variable of numeric year: the last four characters of each month
# label are parsed as the year (assumes the labels end in "YYYY" - TODO confirm).
price$Year <- as.character(price$Month)
price$Year <- as.numeric(
  substr(price$Year, nchar(price$Year) - 3, nchar(price$Year))
)

# Mean (rounded to a whole number), minimum and maximum price for each state,
# by year. ungroup() drops the leftover State grouping so that later steps
# work on a plain table.
states <- price[!is.na(price$State), ] %>%
  group_by(State, Year) %>%
  summarise(
    Mean = round(mean(Price), 0),
    Min = min(Price),
    Max = max(Price)
  ) %>%
  ungroup()

# Plot change over time, by state: one point per state and year, coloured by
# year on a black-to-purple gradient.
ggplot(
  states[!is.na(states$Mean), ],
  aes(x = fct_rev(State), y = as.numeric(Mean))
) +
  labs(
    title = "Zillow | Mean US Rent Prices",
    subtitle = "2010 - 2017",
    x = "States", y = "Average Price",
    colour = "Year" # legend title; previously the literal string "#E0E0E0"
  ) +
  geom_point(shape = 20, alpha = 0.8, size = 5, aes(color = Year)) +
  # A scale takes no aes(); the stray aes(guide = "") argument was removed.
  scale_colour_gradient(low = "black", high = "purple") +
  # Year is mapped to colour, so the colourbar is sized via `colour`, not `fill`.
  guides(colour = guide_colourbar(barwidth = 0.7, barheight = 15)) +
  coord_flip()

Hex plot of rent prices over time for the top 5 cities (hexagons show the number of observations in each bin)

library(zoo)
library(ggfortify)
library(reshape2)

# Take the first `num_city` rows of zillow and transpose their monthly price
# columns (7:81) so that each column is one location and each row one month.
num_city <- 5
values <- head(zillow, n = num_city)
values <- data.frame(t(as.matrix(values[, 7:81])))
# Column 2 of zillow supplies the series names (presumably the city name).
colnames(values) <- zillow[seq_len(num_city), 2]

# One index entry per month, from November 2010 through January 2017
date <- as.yearmon(seq(as.Date("2010/11/01"), as.Date("2017/01/31"), by = "month"))
ts <- zoo(values, order.by = date)
values <- fortify(ts)
values$Index <- as.Date(values$Index)

# All series on a single panel (facets = NULL), binned into hexagons whose
# fill uses the continuous viridis scale.
autoplot(ts, facets = NULL) +
  geom_hex(size = 1.5, alpha = 0.7) +
  scale_fill_viridis_c() +
  guides(fill = guide_colourbar(barwidth = 0.7, barheight = 15)) +
  labs(x = "Time", y = "Price") +
  theme_minimal()